{
 "cells": [
  {
   "cell_type": "markdown",
   "id": "ca665268",
   "metadata": {},
   "source": [
    "# This file replicates the regressions in PSMJ"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 220,
   "id": "4e366507",
   "metadata": {},
   "outputs": [],
   "source": [
    "import pandas as pd\n",
    "from statsmodels.iolib.summary2 import summary_col\n",
    "from linearmodels import PanelOLS"
   ]
  },
  {
   "cell_type": "markdown",
   "id": "7e907975",
   "metadata": {},
   "source": [
    "#### Step 1: Loading the sample file"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 221,
   "id": "28e55cea",
   "metadata": {},
   "outputs": [],
   "source": [
    "# Read the PSMJ sample file; the path is relative to the notebook's working directory.\n",
    "df = pd.read_parquet(path='../input/psmjsample.parquet')"
   ]
  },
  {
   "cell_type": "markdown",
   "id": "53aaa89c",
   "metadata": {},
   "source": [
    "#### Step 2: Keep only the observations flagged `INSAMPLE`"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 222,
   "id": "82e55341",
   "metadata": {},
   "outputs": [],
   "source": [
    "# Keep only the rows flagged INSAMPLE. The explicit .copy() makes dfsample an\n",
    "# independent frame, so adding columns to it later does not raise pandas'\n",
    "# SettingWithCopyWarning.\n",
    "dfsample = df[df['INSAMPLE']==1].copy()"
   ]
  },
  {
   "cell_type": "markdown",
   "id": "9405c068",
   "metadata": {},
   "source": [
    "#### Step 3: Create sample-split indicators"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 223,
   "id": "ba595529",
   "metadata": {},
   "outputs": [],
   "source": [
    "# Check-only recipients: EVER REBATE is 1 and EVER EFT is 0\n",
    "df['ONLY CHECK'] = df['EVER REBATE'].eq(1) & df['EVER EFT'].eq(0)\n",
    "\n",
    "# EFT-only recipients: EVER REBATE is 1 and EVER CHECK is 0\n",
    "df['ONLY EFT'] = df['EVER REBATE'].eq(1) & df['EVER CHECK'].eq(0)"
   ]
  },
  {
   "cell_type": "markdown",
   "id": "bea2c457",
   "metadata": {},
   "source": [
    "#### Step 4: Run the regressions and write the results tables to a text file"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 227,
   "id": "8018a9b0",
   "metadata": {},
   "outputs": [
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "<ipython-input-227-43d427c5ebd9>:1: SettingWithCopyWarning: \n",
      "A value is trying to be set on a copy of a slice from a DataFrame.\n",
      "Try using .loc[row_indexer,col_indexer] = value instead\n",
      "\n",
      "See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy\n",
      "  dfsample['INT_DATE_COPY'] = dfsample.index.get_level_values(1)\n",
      "<ipython-input-227-43d427c5ebd9>:12: UserWarning: Boolean Series key will be reindexed to match DataFrame index.\n",
      "  mod = PanelOLS.from_formula(depvar + ' ~   RBTAMT + AGE + d_NUM_ADULTS + d_PERSLT18 + TimeEffects ', dfsample[df[sampleselec]==1])\n",
      "<ipython-input-227-43d427c5ebd9>:12: UserWarning: Boolean Series key will be reindexed to match DataFrame index.\n",
      "  mod = PanelOLS.from_formula(depvar + ' ~   RBTAMT + AGE + d_NUM_ADULTS + d_PERSLT18 + TimeEffects ', dfsample[df[sampleselec]==1])\n",
      "<ipython-input-227-43d427c5ebd9>:12: UserWarning: Boolean Series key will be reindexed to match DataFrame index.\n",
      "  mod = PanelOLS.from_formula(depvar + ' ~   RBTAMT + AGE + d_NUM_ADULTS + d_PERSLT18 + TimeEffects ', dfsample[df[sampleselec]==1])\n",
      "<ipython-input-227-43d427c5ebd9>:12: UserWarning: Boolean Series key will be reindexed to match DataFrame index.\n",
      "  mod = PanelOLS.from_formula(depvar + ' ~   RBTAMT + AGE + d_NUM_ADULTS + d_PERSLT18 + TimeEffects ', dfsample[df[sampleselec]==1])\n"
     ]
    },
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "\n",
      "=========================================================\n",
      "               d_FOODBEVS  d_SNDEXP   d_NDEXP   d_TOTEXP2\n",
      "---------------------------------------------------------\n",
      "RBTAMT         0.013      0.071      0.120**    0.518*** \n",
      "               (0.027)    (0.047)    (0.056)    (0.183)  \n",
      "AGE            0.717**    -0.226     0.663      5.541**  \n",
      "               (0.338)    (0.632)    (0.786)    (2.237)  \n",
      "d_NUM_ADULTS   153.983*** 382.112*** 483.073*** 533.099  \n",
      "               (55.560)   (102.290)  (115.729)  (387.096)\n",
      "d_PERSLT18     40.545     98.683     109.682    -311.207 \n",
      "               (45.817)   (83.660)   (103.169)  (347.452)\n",
      "R-squared      0.001      0.002      0.002      0.001    \n",
      "R-squared Adj. nan        nan        nan        nan      \n",
      "N              17304      17304      17304      17304    \n",
      "R2             0.00       0.00       0.00       0.00     \n",
      "=========================================================\n",
      "Standard errors in parentheses.\n",
      "* p<.1, ** p<.05, ***p<.01\n"
     ]
    },
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "<ipython-input-227-43d427c5ebd9>:12: UserWarning: Boolean Series key will be reindexed to match DataFrame index.\n",
      "  mod = PanelOLS.from_formula(depvar + ' ~   RBTAMT + AGE + d_NUM_ADULTS + d_PERSLT18 + TimeEffects ', dfsample[df[sampleselec]==1])\n"
     ]
    },
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "\n",
      "=========================================================\n",
      "               d_FOODBEVS  d_SNDEXP   d_NDEXP   d_TOTEXP2\n",
      "---------------------------------------------------------\n",
      "RBTAMT         0.047      0.146***   0.198***   0.685*** \n",
      "               (0.031)    (0.055)    (0.067)    (0.224)  \n",
      "AGE            0.672*     -0.015     1.310      5.995**  \n",
      "               (0.399)    (0.747)    (0.956)    (2.751)  \n",
      "d_NUM_ADULTS   130.524**  312.442*** 451.549*** -85.991  \n",
      "               (52.833)   (94.088)   (115.178)  (377.497)\n",
      "d_PERSLT18     42.017     185.477*   154.781    -383.894 \n",
      "               (62.174)   (105.882)  (131.756)  (444.790)\n",
      "R-squared      0.001      0.002      0.003      0.001    \n",
      "R-squared Adj. nan        nan        nan        nan      \n",
      "N              11154      11154      11154      11154    \n",
      "R2             0.00       0.00       0.00       0.00     \n",
      "=========================================================\n",
      "Standard errors in parentheses.\n",
      "* p<.1, ** p<.05, ***p<.01\n"
     ]
    },
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "<ipython-input-227-43d427c5ebd9>:12: UserWarning: Boolean Series key will be reindexed to match DataFrame index.\n",
      "  mod = PanelOLS.from_formula(depvar + ' ~   RBTAMT + AGE + d_NUM_ADULTS + d_PERSLT18 + TimeEffects ', dfsample[df[sampleselec]==1])\n",
      "<ipython-input-227-43d427c5ebd9>:12: UserWarning: Boolean Series key will be reindexed to match DataFrame index.\n",
      "  mod = PanelOLS.from_formula(depvar + ' ~   RBTAMT + AGE + d_NUM_ADULTS + d_PERSLT18 + TimeEffects ', dfsample[df[sampleselec]==1])\n",
      "<ipython-input-227-43d427c5ebd9>:12: UserWarning: Boolean Series key will be reindexed to match DataFrame index.\n",
      "  mod = PanelOLS.from_formula(depvar + ' ~   RBTAMT + AGE + d_NUM_ADULTS + d_PERSLT18 + TimeEffects ', dfsample[df[sampleselec]==1])\n",
      "<ipython-input-227-43d427c5ebd9>:12: UserWarning: Boolean Series key will be reindexed to match DataFrame index.\n",
      "  mod = PanelOLS.from_formula(depvar + ' ~   RBTAMT + AGE + d_NUM_ADULTS + d_PERSLT18 + TimeEffects ', dfsample[df[sampleselec]==1])\n"
     ]
    },
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "\n",
      "=========================================================\n",
      "               d_FOODBEVS  d_SNDEXP   d_NDEXP   d_TOTEXP2\n",
      "---------------------------------------------------------\n",
      "RBTAMT         0.027      0.148*     0.151      0.772*** \n",
      "               (0.043)    (0.083)    (0.097)    (0.265)  \n",
      "AGE            0.706      0.288      0.749      6.616*   \n",
      "               (0.527)    (0.939)    (1.203)    (3.402)  \n",
      "d_NUM_ADULTS   183.586*** 394.446*** 439.842*** 356.268  \n",
      "               (67.930)   (124.947)  (142.036)  (425.241)\n",
      "d_PERSLT18     72.013     282.868**  223.711    -462.338 \n",
      "               (78.638)   (141.322)  (165.620)  (601.904)\n",
      "R-squared      0.002      0.004      0.003      0.002    \n",
      "R-squared Adj. nan        nan        nan        nan      \n",
      "N              6477       6477       6477       6477     \n",
      "R2             0.00       0.00       0.00       0.00     \n",
      "=========================================================\n",
      "Standard errors in parentheses.\n",
      "* p<.1, ** p<.05, ***p<.01\n"
     ]
    },
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "<ipython-input-227-43d427c5ebd9>:12: UserWarning: Boolean Series key will be reindexed to match DataFrame index.\n",
      "  mod = PanelOLS.from_formula(depvar + ' ~   RBTAMT + AGE + d_NUM_ADULTS + d_PERSLT18 + TimeEffects ', dfsample[df[sampleselec]==1])\n",
      "<ipython-input-227-43d427c5ebd9>:12: UserWarning: Boolean Series key will be reindexed to match DataFrame index.\n",
      "  mod = PanelOLS.from_formula(depvar + ' ~   RBTAMT + AGE + d_NUM_ADULTS + d_PERSLT18 + TimeEffects ', dfsample[df[sampleselec]==1])\n",
      "<ipython-input-227-43d427c5ebd9>:12: UserWarning: Boolean Series key will be reindexed to match DataFrame index.\n",
      "  mod = PanelOLS.from_formula(depvar + ' ~   RBTAMT + AGE + d_NUM_ADULTS + d_PERSLT18 + TimeEffects ', dfsample[df[sampleselec]==1])\n"
     ]
    },
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "\n",
      "========================================================\n",
      "               d_FOODBEVS  d_SNDEXP  d_NDEXP  d_TOTEXP2 \n",
      "--------------------------------------------------------\n",
      "RBTAMT         0.039      0.073     0.177*    0.528     \n",
      "               (0.048)    (0.086)   (0.102)   (0.419)   \n",
      "AGE            0.248      -1.076    1.611     0.521     \n",
      "               (0.632)    (1.300)   (1.713)   (4.720)   \n",
      "d_NUM_ADULTS   52.566     146.357   434.207** -1206.021*\n",
      "               (89.419)   (149.142) (202.294) (684.209) \n",
      "d_PERSLT18     4.264      57.312    117.924   -325.430  \n",
      "               (111.754)  (166.382) (234.599) (652.223) \n",
      "R-squared      0.000      0.001     0.002     0.002     \n",
      "R-squared Adj. nan        nan       nan       nan       \n",
      "N              4589       4589      4589      4589      \n",
      "R2             0.00       0.00      0.00      0.00      \n",
      "========================================================\n",
      "Standard errors in parentheses.\n",
      "* p<.1, ** p<.05, ***p<.01\n"
     ]
    },
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "<ipython-input-227-43d427c5ebd9>:12: UserWarning: Boolean Series key will be reindexed to match DataFrame index.\n",
      "  mod = PanelOLS.from_formula(depvar + ' ~   RBTAMT + AGE + d_NUM_ADULTS + d_PERSLT18 + TimeEffects ', dfsample[df[sampleselec]==1])\n"
     ]
    }
   ],
   "source": [
    "def fit_expenditure_regression(depvar, data):\n",
    "    \"\"\"Fit one regression of `depvar` on the rebate amount and controls.\n",
    "\n",
    "    The model includes time effects and clusters standard errors by entity.\n",
    "    The returned linearmodels result carries a few statsmodels-style aliases\n",
    "    so that it can be passed to summary_col.\n",
    "    \"\"\"\n",
    "    formula = depvar + ' ~   RBTAMT + AGE + d_NUM_ADULTS + d_PERSLT18 + TimeEffects '\n",
    "    res = PanelOLS.from_formula(formula, data).fit(cov_type='clustered', cluster_entity=True)\n",
    "\n",
    "    # some items are named differently for summary column to work\n",
    "    res.bse = res.std_errors\n",
    "    res.tvalues = res.tstats\n",
    "    res.model.exog_names = res._var_names\n",
    "    res.model.endog_names = depvar\n",
    "    return res\n",
    "\n",
    "\n",
    "# Keep the second index level as an ordinary column (presumably the interview\n",
    "# date, going by the column name). .assign returns a new frame, so this neither\n",
    "# writes into a slice of df nor raises SettingWithCopyWarning.\n",
    "dfsample = dfsample.assign(INT_DATE_COPY=dfsample.index.get_level_values(1))\n",
    "\n",
    "# we write all the diagnostics to text as well\n",
    "with open('../output/psmjregressions.txt', 'w') as f:\n",
    "\n",
    "    for sampleselec in ['INSAMPLE', 'EVER REBATE', 'ONLY CHECK', 'ONLY EFT']:\n",
    "\n",
    "        # The selection flags live on df, which has more rows than dfsample.\n",
    "        # Align the flag to dfsample's index explicitly instead of relying on\n",
    "        # pandas' implicit (and warned-about) reindexing of a boolean key.\n",
    "        in_subsample = df[sampleselec].reindex(dfsample.index) == 1\n",
    "        subsample = dfsample.loc[in_subsample]\n",
    "\n",
    "        # one regression per expenditure measure, all on the same subsample\n",
    "        resset = [fit_expenditure_regression(depvar, subsample)\n",
    "                  for depvar in ['d_FOODBEVS','d_SNDEXP','d_NDEXP','d_TOTEXP2']]\n",
    "\n",
    "        # combine results in columns and print to text file\n",
    "        results = summary_col(resset,stars=True,float_format='%0.3f',\n",
    "                          info_dict={'N':lambda x: \"{0:d}\".format(int(x.nobs)),\n",
    "                                     'R2':lambda x: \"{:.2f}\".format(x.rsquared)})\n",
    "\n",
    "        print(results)\n",
    "        f.write(str(sampleselec) + '\\n')\n",
    "        f.write(str(results) + '\\n\\n\\n')\n"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "059088a0",
   "metadata": {},
   "outputs": [],
   "source": []
  }
 ],
 "metadata": {
  "kernelspec": {
   "display_name": "Python 3",
   "language": "python",
   "name": "python3"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.8.8"
  }
 },
 "nbformat": 4,
 "nbformat_minor": 5
}
